# Function calculating the MSE between the predicted intensities and the observed intensities used in the calibration of the threshold 
optim_mse = function(lambda, pred_data){
  pred_data$higher_than_stage_1 = pred_data$pred_stage_1>lambda[1]
  pred_data$higher_than_stage_2 = (pred_data$pred_stage_2>lambda[2]) &pred_data$higher_than_stage_1
  
  pred_data$pred_final = NA
  pred_data$pred_final[!pred_data$higher_than_stage_1] = 0
  pred_data$pred_final[!pred_data$higher_than_stage_2] = 0
  pred_data$pred_final[is.na(pred_data$pred_final)] = 
    pred_data$pred_stage_3[is.na(pred_data$pred_final)]
  return(mean((log1p( pred_data$future_ged_best_sb) - log1p(pred_data$pred_final))^2))
}

# Function calculating the Balance Statistic between the predicted intensities and the observed intensities used in the calibration of the threshold 
optim_balance = function(lambda, pred_data){
  pred_data$higher_than_stage_1 = pred_data$pred_stage_1>lambda[1]
  pred_data$higher_than_stage_2 = (pred_data$pred_stage_2>lambda[2]) &pred_data$higher_than_stage_1
  
  pred_data$pred_final = NA
  pred_data$pred_final[!pred_data$higher_than_stage_1] = 0
  pred_data$pred_final[!pred_data$higher_than_stage_2] = 0
  pred_data$pred_final[is.na(pred_data$pred_final)] = 
    pred_data$pred_stage_3[is.na(pred_data$pred_final)]
  return(abs((sum(log1p( pred_data$future_ged_best_sb)) - sum(log1p(pred_data$pred_final)))))
}


# The single argument to this function, points, is a data.frame in which:
#   - column 1 contains the longitude in degrees
#   - column 2 contains the latitude in degrees
coords_to_continent = function(points)
{  
  #countriesSP <- getMap(resolution='low')
  countriesSP <- getMap(resolution='high') #you could use high res map from rworldxtra if you were concerned about detail
  
  # converting points to a SpatialPoints object
  # setting CRS directly to that from rworldmap
  pointsSP = SpatialPoints(points, proj4string=CRS(proj4string(countriesSP)))  
  
  
  # use 'over' to get indices of the Polygons object containing each point 
  indices = over(pointsSP, countriesSP)
  
  list("continent1" = indices$continent ,
       "continent2" = indices$REGION, 
       "country" = indices$ADMIN, 
       "iso3" = indices$ISO3 )
}


coords_to_continent_missing = function(data_missing,countries)
{  
  #countriesSP <- getMap(resolution='low')
  countriesSP <- getMap(resolution='high') #you could use high res map from rworldxtra if you were concerned about detail
  countriesSP =  countriesSP[countriesSP$NAME %in% countries,] 
  centroids <- getSpPPolygonsLabptSlots(countriesSP)
  points = data.table(long = data_missing$longitude, 
                      lat = data_missing$latitude)
  
  # converting points to a SpatialPoints object
  # setting CRS directly to that from rworldmap
  pointsSP = SpatialPoints(points, proj4string=CRS(proj4string(countriesSP)))  
  centroids= SpatialPoints(centroids, proj4string=CRS(proj4string(countriesSP)))  
  
  res = spDists(pointsSP, centroids)
  res = apply(res, 1,which.min)
  
  list("continent1" = countriesSP[[37]][res] ,
       "continent2" = countriesSP[[38]][res], 
       "country" = countriesSP[[6]][res], 
       "iso3" = countriesSP[[1]][res] )
}

data_prep = function(cm_data,pgm_data, S) {
  # 1. Level: Country-Level In which country will there be any deaths?
  cm_data_1 = prepare_model_cm(cm_data =cm_data,S = S) 
  data_stage_1 = cm_data_1
  # 2. Level: Given that a country has deaths, in which prio grids will they be? 
  # Which countries had deaths in which month? 
  tmp_data = cm_data_1[ged_dummy_sb == 1, c("country_id", "key_cm")]
  pgm_data_1 = prepare_model_pgm(pgm_data,S = S)
  data_stage_2 = pgm_data_1[key_cm %in% tmp_data$key_cm]
  
  # 3. Level: Given that a prio grid has deaths how many in particular? 
  
  data_stage_3 = data_stage_2[future_ged_dummy_sb>0]
  
  # Split the three stages up into train/test/calibration data sets 
  
  # Generally the last month in the dataset is used for prediction (test split)
  test_data_stage_1 = data_stage_1[date_target == max(date_target)]
  # the three months before are used for calibration
  calibrate_data_stage_1 = data_stage_1[date_target == max(date_target) - months(S)]
  # and all observations before that are purely for training 
  train_data_stage_1 = data_stage_1[date_target < (max(date_target) - months(S))]
  # the months where t-s is avaliable but not t are not used hence saved as do_nothing_data_x
  do_nothing_data_1 = data_stage_1[date_target %within% interval(max(date_target) - months(S) + months(1),
                                                                 max(date_target) - months(1))]
  
 
  test_data_stage_2 = data_stage_2[date == max(date)]
  calibrate_data_stage_2 = data_stage_2[date_target == (max(date_target) - months(S))]
  train_data_stage_2 = data_stage_2[date_target < (max(date_target) - months(S))]
  # the months where t-s is avaliable but not t are not used hence saved as do_nothing_data_x
  do_nothing_data_2 = data_stage_2[date_target %within% interval(max(date_target) - months(S) + months(1),
                                                                 max(date_target) - months(1))]
  
  test_data_stage_3 = data_stage_3[date == max(date)]
  calibrate_data_stage_3 = data_stage_3[date_target == (max(date_target) - months(S))]
  train_data_stage_3 = data_stage_3[date_target < (max(date_target) - months(S))]
  do_nothing_data_3 = data_stage_3[date_target %within% interval(max(date_target) - months(S) + months(1),
                                                                 max(date_target) - months(1))]
  
  test_two_stage = pgm_data_1[date == max(date)]
  calibrate_two_stage = pgm_data_1[date_target == (max(date_target) - months(S))]
  train_two_stage = pgm_data_1[date_target < (max(date_target) - months(S))]
  
  # This dataset includes are PRIO-grid observations for training 
  all_pgm_data_train = pgm_data_1[date_target <= (max(date_target) - months(S))]
  # Complete dataset for calibration 
  pgm_calibrate_1 = pgm_data_1[date_target  == (max(date_target) - months(S))]
  cm_calibrate_1 =cm_data_1[date_target  == (max(date_target) - months(S))]
  # Complete dataset for testing 
  pgm_data_1 = pgm_data_1[date == max(date)]
  cm_data_1 =cm_data_1[date == max(date)]
  
  return(list(stage_1 = data_stage_1,stage_2 = data_stage_2,stage_3 = data_stage_3, 
              test_data_stage_1 = test_data_stage_1, 
              test_data_stage_2 = test_data_stage_2,
              test_data_stage_3 = test_data_stage_3, 
              train_data_stage_1 = train_data_stage_1, 
              train_data_stage_2 = train_data_stage_2, 
              train_data_stage_3 = train_data_stage_3, 
              test_two_stage = test_two_stage, 
              train_two_stage = train_two_stage, 
              calibrate_two_stage = calibrate_two_stage, 
              calibrate_data_stage_1 = calibrate_data_stage_1,
              calibrate_data_stage_2 = calibrate_data_stage_2, 
              calibrate_data_stage_3 = calibrate_data_stage_3, 
              cm_data_comp = cm_data_1, 
              pgm_data_comp = pgm_data_1,
              all_pgm_data_train = all_pgm_data_train, 
              pgm_calibrate_1 = pgm_calibrate_1, cm_calibrate_1 = cm_calibrate_1, 
              do_nothing_data_1= do_nothing_data_1, 
              do_nothing_data_2= do_nothing_data_2, 
              do_nothing_data_3 = do_nothing_data_3))
}




# with this function we lag the cm (country-month) data by S months between the target variable and all covariates 
prepare_model_cm = function(cm_data,S){
  
  cm_data = cm_data[order(country_id,month_id)] # Ordering by month_id should naturally already be done before hand 
  
  # cm_data_old = cm_data[order(month_id)] 
  # cm_data_old = cm_data_old[order(country_id)]
  # 
  # cm_data = cm_data[order(cm_data$country_id)] # Ordering by country_id is of utmost importance!!!!
  # month_min = min(cm_data$month_id)
  month_max =  max(cm_data$month_id)
  # var_ind = which((names(cm_data) == "ged_dummy_sb") | (names(cm_data) == "month_id"))
  cm_data[, future_target:=c(ged_dummy_sb[-(1:S)],rep(NA, S)), by=country_id]
  cm_data[, date_target:=c(date[-(1:S)],rep(NA, S)), by=country_id]
  
  #cm_data[,.(date,country_name,lag, ged_dummy_sb)]
  # c(cm_data[country_id == "191"]$ged_dummy_sb[-(1:S)],rep(NA, S))
  # cm_data[country_id == "191"]$date_new
  
  
  # cm_data[month_id %in% c((month_max-S +1) : month_max),lag]
  # cm_data[is.na(lag) & date_new != "2019-10-01", key_cm]
  # cm_data[key_cm == "161_191"]
  # 
  # cm_data[key_cm == "1990_192"]
  
  cm_data_lag = cm_data[!is.na(future_target)]
  #    cm_data[!month_id %in% c((month_max-S +1) : month_max)]
  
  # cm_data_lag[,.(country_id, month_id, lag, ged_dummy_sb)]
  
  
  cm_data_lag$ged_dummy_sb = cm_data_lag$future_target
  return(cm_data_lag)
  #   Test
  #   tail(data_1[,1:10])
  #   tail(cm_data[,1:10])
}

# # 
# unique_1_cm = unique(cm_data$key_cm)
# unique_2_cm = unique(cm_data_lag$key_cm)
# 
# unique_1_cm[!unique_1_cm %in% unique_2_cm]
# unique_2_cm[!unique_2_cm %in% unique_1_cm]
# 
# 
# country_id,month_id
prepare_model_pgm = function(pgm_data,S){
  pgm_data = pgm_data[order(pg_id,month_id)] # Ordering by month_id should naturally already be done before hand 
  # pgm_data = pgm_data[order(pg_id)] # Ordering by country_id is of utmost importance!!!!
  # month_min = min(pgm_data$month_id)
  month_max =  max(pgm_data$month_id)
  # var_ind = which((names(pgm_data) == "ged_best_sb") | (names(pgm_data) == "month_id") | (names(pgm_data) == "ged_dummy_sb"))
  # 
  pgm_data[, future_ged_dummy_sb:=c(ged_dummy_sb[-(1:S)],rep(NA, S)), by=pg_id]
  pgm_data[, future_ged_best_sb:=c(ged_best_sb[-(1:S)],rep(NA, S)), by=pg_id]
  pgm_data[, date_target:=c(date[-(1:S)],rep(NA, S)), by=pg_id]
  
  pgm_data_lag = pgm_data[!is.na(future_ged_dummy_sb)]
  
  
  return(pgm_data_lag)
  
  #   Test
  # tail(data_1[,1:10])
  # tail(pgm_data[,1:10])
}



data_prep_forecast = function(cm_data,pgm_data, S) {
  # 1. Level: Country-Level In which country will there be any deaths?
  forecast_date = max(cm_data$date) + months(S)
  cm_data_1 = prepare_model_cm_forecast(cm_data =cm_data,S = S, forecast_date = forecast_date)
  # Do not include observations that are missing more than 10% of the covariates 
  # cm_data_1 = cm_data_1[cm_data_1$percent_na<0.1,]
  # cm_data_1 = cm_data_1[,c("name","country_id","country_iso3","ged_dummy_sb","month_id","avr_lon","avr_lat" ,
  #                          "fvp_gdp200","fvp_population200","allainces_count", "polity","milit_exp", 
  #                          "mcw_receiver","mcw_sender", "salw_sender","salw_receiver", 
  #                          "mcw_receiver_rolling","mcw_sender_rolling","mcw_receiver_acute","mcw_sender_acute",
  #                          "name_fac","time_since_ged_dummy_sb", 
  #                          "time_since_ged_dummy_ns", "time_since_ged_dummy_os", 
  #                          "ged_dummy_ns", "ged_dummy_os")]
  # library(Amelia)
  # imp_data_1 = amelia(cm_data_1,idvars = c("name","country_id","country_iso3","name_fac"),m =1, 
  #                     ts = "month_id", logs = c("allainces_count","fvp_gdp200","fvp_population200", 
  #                                               "milit_exp", "mcw_receiver",
  #                                               "mcw_sender", "salw_sender","salw_receiver",
  #                                               "mcw_receiver_rolling","mcw_sender_rolling", 
  #                                               "mcw_receiver_acute","mcw_sender_acute"))
  # 
  data_stage_1 = cm_data_1
  
  # unique_1_cm = unique(cm_data_1$key_cm)
  # unique_2_cm = unique(cm_data$key_cm)
  # 
  # unique_1_cm[!unique_1_cm %in% unique_2_cm]
  # unique_2_cm[!unique_2_cm %in% unique_1_cm]
  
  # 2. Level: Given that a country has deaths, in which prio grids will they be? 
  # Which countries had deaths in which month? 
  tmp_data = cm_data_1[ged_dummy_sb == 1, c("country_id", "key_cm")]
  # tmp_data$id = paste(tmp_data$country_id, tmp_data$month_id)
  # pgm_data$id = paste(pgm_data$country_id, pgm_data$month_id)
  pgm_data_1 = prepare_model_pgm_forecast(pgm_data,S = S, forecast_date = forecast_date)
  data_stage_2 = pgm_data_1[key_cm %in% tmp_data$key_cm]
  
  # 3. Level: Given that a prio grid has deaths how many in particular? 
  
  data_stage_3 = data_stage_2[future_ged_dummy_sb>0]
  
  # Split the three stages up into train/test/calibration data sets 
  
  # Generally the last month in the dataset is used for prediction 
  test_data_stage_1 = data_stage_1[date_future == max(date_future)]
  
  # the one month before are used for calibration 
  day_before_test = sort(unique(data_stage_1$date_future),decreasing = T)[2]
  
  calibrate_data_stage_1 = data_stage_1[date_target == day_before_test]
  # and all observations before that are purely for training 
  train_data_stage_1 = data_stage_1[date_target <= (day_before_test - months(1))]
  
  test_data_stage_2 = data_stage_2[date_future == max(date_future)]
  calibrate_data_stage_2 = data_stage_2[date_target == day_before_test]
  train_data_stage_2 = data_stage_2[date_target <= (day_before_test - months(1))]
  
  test_data_stage_3 = data_stage_3[date_future == max(date_future)]
  calibrate_data_stage_3 = data_stage_3[date_target == day_before_test]
  train_data_stage_3 = data_stage_3[date_target <= (day_before_test - months(1))]
  
  # Complete dataset for calibration 
  pgm_calibrate_1 = pgm_data_1[date_target == day_before_test]
  cm_calibrate_1 =cm_data_1[date_target == day_before_test]
  # Complete dataset for testing 
  pgm_data_1 = pgm_data_1[date_future == max(date_future)]
  cm_data_1 =cm_data_1[date_future == max(date_future)]
  
  return(list(stage_1 = data_stage_1,stage_2 = data_stage_2,stage_3 = data_stage_3, 
              test_data_stage_1 = test_data_stage_1, 
              test_data_stage_2 = test_data_stage_2,
              test_data_stage_3 = test_data_stage_3, 
              train_data_stage_1 = train_data_stage_1, 
              train_data_stage_2 = train_data_stage_2, 
              train_data_stage_3 = train_data_stage_3, 
              calibrate_data_stage_1 = calibrate_data_stage_1,
              calibrate_data_stage_2 = calibrate_data_stage_2, 
              calibrate_data_stage_3 = calibrate_data_stage_3, 
              cm_data_comp = cm_data_1, 
              pgm_data_comp = pgm_data_1,
              pgm_calibrate_1 = pgm_calibrate_1, cm_calibrate_1 = cm_calibrate_1))
}





prepare_model_cm_forecast = function(cm_data,S, forecast_date){
  
  cm_data = cm_data[order(country_id,month_id)] 
  # Ordering by month_id should naturally already be done before hand 
  month_max =  max(cm_data$month_id)
  cm_data[, future_target:=c(ged_dummy_sb[-(1:S)],rep(NA, S)), by=country_id]
  cm_data[, date_target:=c(date[-(1:S)],rep(NA, S)), by=country_id]
  cm_data$date_future = cm_data$date + months(S)
  cm_data_additional = cm_data[date_future == forecast_date]
  cm_data_additional$ged_dummy_sb = cm_data_additional$future_target
  
  cm_data_lag = cm_data[!is.na(future_target)]
  cm_data_lag$ged_dummy_sb = cm_data_lag$future_target
  # 
  return(rbind(cm_data_lag, cm_data_additional))
  #   Test
  #   tail(data_1[,1:10])
  #   tail(cm_data[,1:10])
}

# # 
# unique_1_cm = unique(cm_data$key_cm)
# unique_2_cm = unique(cm_data_lag$key_cm)
# 
# unique_1_cm[!unique_1_cm %in% unique_2_cm]
# unique_2_cm[!unique_2_cm %in% unique_1_cm]
# 
# 
# country_id,month_id
prepare_model_pgm_forecast = function(pgm_data,S, forecast_date){
  pgm_data = pgm_data[order(pg_id,month_id)] # Ordering by month_id should naturally already be done before hand 
  # pgm_data = pgm_data[order(pg_id)] # Ordering by country_id is of utmost importance!!!!
  # month_min = min(pgm_data$month_id)
  month_max =  max(pgm_data$month_id)
  # var_ind = which((names(pgm_data) == "ged_best_sb") | (names(pgm_data) == "month_id") | (names(pgm_data) == "ged_dummy_sb"))
  # 
  pgm_data[, future_ged_dummy_sb:=c(ged_dummy_sb[-(1:S)],rep(NA, S)), by=pg_id]
  pgm_data[, future_ged_best_sb:=c(ged_best_sb[-(1:S)],rep(NA, S)), by=pg_id]
  pgm_data[, date_target:=c(date[-(1:S)],rep(NA, S)), by=pg_id]
  pgm_data$date_future = pgm_data$date + months(S)
  
  pgm_data_lag = pgm_data[!is.na(future_ged_dummy_sb)]
  pgm_data_additional = pgm_data[date_future == forecast_date]
  
  return(rbind(pgm_data_lag,pgm_data_additional))
  
  #   Test
  # tail(data_1[,1:10])
  # tail(pgm_data[,1:10])
}


